import numpy as np
import pandas as pd
import calendar
import datetime
import os
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
from pylab import rcParams
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this article, we go analyze the Instacart Market Basket Analysis Dataset from Kaggle. You can download the data from Kaggle.com or from instacart.com. The data includes the following files
The dataset for this competition is a relational set of files describing customers' orders over time. The goal of the competition is to predict which products will be in a user's next order. The dataset is anonymized and contains a sample of over 3 million grocery orders from more than 200,000 Instacart users. For each user, we provide between 4 and 100 of their orders, with the sequence of products purchased in each order. We also provide the week and hour of the day the order was placed and a relative measure of time between orders. For more information, see the blog post accompanying its public release.
The data dictionary is available here.
orders (3.4m rows, 206k users):
order_id: order identifieruser_id: customer identifiereval_set: which evaluation set this order belongs in (see SET described below)order_number: the order sequence number for this user (1 = first, n = nth)order_dow: the day of the week the order was placed onorder_hour_of_day: the hour of the day the order was placed ondays_since_prior: days since the last order, capped at 30 (with NAs for order_number = 1)products (50k rows):
product_id: product identifierproduct_name: name of the productaisle_id: foreign keydepartment_id: foreign keyaisles (134 rows):
aisle_id: aisle identifieraisle: the name of the aisledeptartments (21 rows):
department_id: department identifierdepartment: the name of the departmentorder_products__SET (30m+ rows):
order_id: foreign keyproduct_id: foreign keyadd_to_cart_order: order in which each product was added to cartreordered: 1 if this product has been ordered by this user in the past, 0 otherwisewhere SET is one of the four following evaluation sets (eval_set in orders):
"prior": orders prior to that users most recent order (~3.2m orders)"train": training data supplied to participants (~131k orders)"test": test data reserved for machine learning competitions (~75k orders)def Header(Text, L = 100, C1 = Back.BLUE, C2 = Fore.BLUE):
print(C1 + Fore.WHITE + Style.NORMAL + Text + Style.RESET_ALL + ' ' + C2 +
Style.NORMAL + (L- len(Text) - 1)*'=' + Style.RESET_ALL)
def Line(L=100, C = Fore.BLUE): print(C + Style.NORMAL + L*'=' + Style.RESET_ALL)
PATH = 'Instacart'
Files = os.listdir(PATH)
Files.remove('sample_submission.csv')
Files_Info = pd.DataFrame()
for i in range(len(Files)):
Header(Files[i])
Temp = pd.read_csv(os.path.join(PATH, Files[i]))
display(Temp.head(5))
Files_Info = Files_Info.append(pd.DataFrame({'File':[Files[i]], 'Number of Instances': [Temp.shape[0]],
'Number of Attributes': [Temp.shape[1]]}), ignore_index = True)
filename = Files[i].split('.')[0]
globals() [filename] = pd.read_csv(os.path.join(PATH, Files[i]))
Line()
display(Files_Info.style.hide_index())
Line()
del Files, i, Temp
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(14, 6*3))
## The most ordered Products
Top_number = 15
Header('The Most Ordered Products', L = 100, C1 = Back.GREEN, C2 = Fore.GREEN)
Most_Ordered_Products = order_products_prior.groupby(['product_id'])['product_id'].agg({'count'})
Most_Ordered_Products.columns = ['Count']
Most_Ordered_Products = Most_Ordered_Products.reset_index(drop = False).sort_values(by='Count', ascending=False)
Most_Ordered_Products = pd.merge(Most_Ordered_Products, products, on='product_id', how='right')
Most_Ordered_Products = Most_Ordered_Products.reset_index(drop=True).dropna()
Most_Ordered_Products['Count'] = Most_Ordered_Products['Count'].astype(int)
display(Most_Ordered_Products.head(Top_number))
_ = sns.barplot(ax = ax[0], y='product_name', x='Count', palette='PuBu',
edgecolor='SkyBlue', hatch="///", data=Most_Ordered_Products.head(Top_number))
_ = sns.barplot(ax = ax[0], y='product_name', x='Count', facecolor = 'None',
edgecolor='k',data=Most_Ordered_Products.head(Top_number))
_ = ax[0].set_title('Top %s Ordered Products' % Top_number)
_ = ax[0].set_xlim([0, 5e5])
_ = ax[0].set_ylabel('Products')
_ = ax[0].set_xlabel('Count')
## The most Common Order Size
Top_number = 30
Header('The Most Frequent Order Size', L = 100, C1 = Back.GREEN, C2 = Fore.GREEN)
Most_Ordered = order_products_prior.groupby(['order_id'])['product_id'].agg({'count'})
Most_Ordered.columns = ['Number of Products in an Order']
Most_Ordered = Most_Ordered.reset_index(drop = False).\
groupby(['Number of Products in an Order'])['Number of Products in an Order'].agg({'count'})
Most_Ordered.columns = ['Count']
Most_Ordered = Most_Ordered.reset_index(drop = False).sort_values(by='Count', ascending=False)
display(Most_Ordered.head(Top_number).set_index('Number of Products in an Order').T)
_ = sns.barplot(ax = ax[1], x='Number of Products in an Order', y='Count', palette='OrRd',
edgecolor='DarkOrange', hatch="///", data= Most_Ordered.head(Top_number))
_ = sns.barplot(ax = ax[1], x='Number of Products in an Order', y='Count', facecolor = 'None',
edgecolor='k', data= Most_Ordered.head(Top_number))
_ = ax[1].set_title('%s Most Frequent Order Size' % Top_number)
_ = ax[1].set_ylim([0, 2.5e5])
## The Most Reordered Products
Top_number = 15
Header('The Most Reordered Products', L = 100, C1 = Back.GREEN, C2 = Fore.GREEN)
Temp = order_products_prior.loc[order_products_prior.reordered ==1]
Most_ReOrdered_Products = Temp.groupby(['product_id'])['product_id'].agg({'count'})
Most_ReOrdered_Products.columns = ['Count']
Most_ReOrdered_Products = Most_ReOrdered_Products.reset_index(drop = False).sort_values(by='Count', ascending=False)
Most_ReOrdered_Products = pd.merge(Most_ReOrdered_Products, products, on='product_id', how='right')
Most_ReOrdered_Products = Most_ReOrdered_Products.reset_index(drop=True).dropna()
Most_ReOrdered_Products['Count'] = Most_ReOrdered_Products['Count'].astype(int)
display(Most_ReOrdered_Products.head(Top_number))
_ = sns.barplot(ax = ax[2], y='product_name', x='Count', palette='PiYG',
edgecolor='SkyBlue', hatch="///", data=Most_ReOrdered_Products.head(Top_number))
_ = sns.barplot(ax = ax[2], y='product_name', x='Count', facecolor = 'None',
edgecolor='k',data=Most_ReOrdered_Products.head(Top_number))
_ = ax[2].set_title('Top %s Reordered Products' % Top_number)
_ = ax[2].set_xlim([0, 4.5e5])
_ = ax[2].set_ylabel('Products')
_ = ax[2].set_xlabel('Count')
Header('Orders Distributions (Week)', L = 100)
Orders_Distributions_Week = orders.groupby(['order_dow'])['order_dow'].agg({'count'})
Orders_Distributions_Week.index = calendar.day_name[0:7]
Orders_Distributions_Week.reset_index(inplace = True, drop = False)
Orders_Distributions_Week.columns =['Day of Week', 'Count']
display(Orders_Distributions_Week.set_index('Day of Week').T)
Header('Orders Distributions (Hours)', L = 100)
Orders_Distributions_Hours = orders.groupby(['order_hour_of_day'])['order_hour_of_day'].agg({'count'}).reset_index(drop = False)
Orders_Distributions_Hours.columns = ['Hour of Day', 'Count']
display(Orders_Distributions_Hours.set_index('Hour of Day').T)
fig = plt.figure(figsize=(13, 6), constrained_layout=True)
ax = np.zeros(2, dtype = 'object')
gs = fig.add_gridspec(1, 3)
ax[0] = fig.add_subplot(gs[0])
ax[1] = fig.add_subplot(gs[1:])
# Left Plot
_ = sns.barplot(ax = ax[0], y='Count', x='Day of Week', palette='summer',
edgecolor='LimeGreen', hatch="O", data= Orders_Distributions_Week)
_ = sns.barplot(ax = ax[0], y='Count', x='Day of Week', facecolor = 'None',
edgecolor='k',data= Orders_Distributions_Week)
_ = ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation=90)
_ = ax[0].set_ylim([0, 7e5])
# Right Plot
_ = sns.barplot(ax = ax[1], y='Count', x='Hour of Day', palette='cool',
edgecolor='RoyalBlue', hatch="*", data= Orders_Distributions_Hours)
_ = sns.barplot(ax = ax[1], y='Count', x='Hour of Day', facecolor = 'None',
edgecolor='k',data= Orders_Distributions_Hours)
_ = ax[1].set_ylim([0, 3e5])
fig.tight_layout()
Most orders take place on Mondays and Tuesdays. Besides, between 9:00 AM and 5:00 PM, most orders take place daily.
Orders_Distributions_Total = (orders.pivot_table(index='order_dow', columns='order_hour_of_day',
values='order_id', aggfunc='count').fillna(0)/1000).round(2)
Orders_Distributions_Total.index=calendar.day_name[0:7]
Orders_Distributions_Total.T.index.name = 'Hour of Day'
display(Orders_Distributions_Total)
fig, ax = plt.subplots(figsize=(15,6))
_ = sns.heatmap(Orders_Distributions_Total, annot=True, cmap =sns.color_palette("RdYlGn", n_colors=10),
annot_kws={"size": 12}, linewidths=.5, ax=ax, vmin=0, vmax=60,
cbar_kws={ 'label': 'Number of Orders $\\times$ $10^3$', "aspect":30, "shrink": .95})
_ = ax.set_ylabel('Week Days')
_ = ax.set_xlabel('Hour of the day')
fig.tight_layout()
Between 9:00 AM and 5:00 PM, most orders take place on Mondays and Tuesdays.
Days_since_prior_order_summary = orders.days_since_prior_order.value_counts().sort_index().to_frame('Count').reset_index()
Days_since_prior_order_summary.columns = ['Days Since Prior Order', 'Count']
Days_since_prior_order_summary['Days Since Prior Order'] = Days_since_prior_order_summary['Days Since Prior Order'].astype(int)
#
fig = plt.figure(figsize=(13.5, 6))
ax = np.zeros(2, dtype = 'object')
gs = fig.add_gridspec(1, 3)
ax[0] = fig.add_subplot(gs[0])
ax[1] = fig.add_subplot(gs[1:])
# Left Plot
_ = ax[0].boxplot(orders.days_since_prior_order.dropna(), 0, 'rs', 0)
_ = ax[0].set_xlabel('Days since prior order')
_ = ax[0].set_title('Distribution of The Number\nof Days Since Prior Order')
_ = ax[0].set_xlim([0, 20])
# Right Plot
_ = sns.barplot(ax = ax[1], y='Count', x='Days Since Prior Order', palette='summer',
edgecolor='DarkGreen', hatch="..", data= Days_since_prior_order_summary)
_ = sns.barplot(ax = ax[1], y='Count', x='Days Since Prior Order', facecolor = 'None',
edgecolor='k', data= Days_since_prior_order_summary)
_ = ax[1].set_ylim([0, 4e5])
_ = ax[1].set_xlabel('Days Since Prior Order')
_ = ax[1].set_title('Distribution of The Number\nof Days Since Prior Order')
fig.tight_layout()
Usually, customers order from 5 days to 15 days after the prior order. The majority of orders take place after a week and a month after the prior order.
Products_Detailed = pd.merge(left =pd.merge(left=products, right= departments, how='left'), right=aisles, how='left')
Products_Detailed.columns = [x.replace('_',' ').title().replace('Id','ID') for x in Products_Detailed.columns.tolist()]
Products_Detailed['Department'] = Products_Detailed['Department'].map(lambda x: x.title())
Products_Detailed['Aisle'] = Products_Detailed['Aisle'].map(lambda x: x.title())
Products_Detailed.head(10).style.hide_index()
Products_by_Department = Products_Detailed.groupby('Department')['Product ID'].count().\
to_frame('Total Products').reset_index(drop = False)
Products_by_Department['Percentage'] = np.round(100* Products_by_Department['Total Products']\
/ Products_by_Department['Total Products'].sum(),2)
Products_by_Department.sort_values(by='Total Products', ascending=False, inplace=True)
Colors = ['LightGreen']
LC = 'DarkGreen'
fig = px.bar(Products_by_Department, x = 'Department', y='Total Products', text = 'Percentage',
color_discrete_sequence= Colors,
hover_data= Products_by_Department.columns)
fig.update_layout(plot_bgcolor= 'white', barmode='stack')
fig.update_traces(marker_line_color= LC, marker_line_width=1, opacity=1)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
showgrid=True, gridwidth=1, gridcolor='Lightgray', range=[0, 7e3])
fig.update_layout(title={'text': 'Total Products in Each Departments', 'x':0.5, 'y':0.94,
'xanchor': 'center', 'yanchor': 'top'})
fig.show()
It can be seen that Personal Care and Snacks are the best seller departments.
def Search_List(Key, List): return [s for s in List if Key in s]
myColors = list(mcolors.CSS4_COLORS.keys())
Temp = []
Temp.extend(Search_List('gray', myColors))
Temp.extend(Search_List('grey', myColors))
Temp.extend(Search_List('black', myColors))
myColors = list(set(myColors)-set(Temp))
#
Products_by_Aisle = Products_Detailed.groupby(["Department", "Aisle"])["Product ID"].count().\
to_frame('Total Products').reset_index(drop = False)
Products_by_Aisle['Percentage'] = np.round(100* Products_by_Aisle['Total Products']\
/ Products_by_Aisle['Total Products'].sum(),2)
Products_by_Aisle = Products_by_Aisle.sort_values(by='Total Products', ascending=False)
Products_by_Aisle = Products_by_Aisle.loc[(Products_by_Aisle['Department'] != 'Missing') &
(Products_by_Aisle['Aisle'] != 'Missing') &
(Products_by_Aisle['Aisle'] != 'Other')]
for d in Products_by_Aisle['Department'].unique().tolist():
Products_by_Aisle.loc[Products_by_Aisle['Department'] == d , 'Percentage'] = \
np.round(100* Products_by_Aisle.loc[Products_by_Aisle['Department'] == d , 'Total Products']\
/Products_by_Aisle.loc[Products_by_Aisle['Department'] == d , 'Total Products'].sum(),2)
# Plot
font = FontProperties()
font.set_weight('bold')
Temp = list(np.linspace(0, 100, 6))
m = Products_by_Aisle['Percentage'].max()
for j in range(len(Temp)-1):
if Temp[j] <= m < Temp[j+1]:
m = Temp[j+1]
break
N = len(Products_by_Aisle.Department.unique())
fpr = 4; C = 0; Ind = list(np.arange(0, N, fpr))
fig, ax = plt.subplots(nrows=int(np.ceil(N/fpr)), ncols=fpr, figsize=(16,30), sharey = True,
gridspec_kw = dict(hspace=1.4, wspace=.0))
ax = ax.ravel()
_ = fig.delaxes(ax[-1])
Palette = ['Purples', 'Blues', 'Greens', 'Oranges']*5
EdgeColor = ['Indigo', 'Navy', 'ForestGreen','Salmon']*5
Hatch = ['//', 'O', '--', '\\']*5
for (aisle, PBA), ax in zip(Products_by_Aisle.groupby(['Department']), ax.flatten()):
_ = sns.barplot(PBA['Aisle'], PBA['Percentage'] , ax=ax, palette= Palette[C], edgecolor=EdgeColor[C], hatch= Hatch[C])
_ = sns.barplot(PBA['Aisle'], PBA['Percentage'] , ax=ax, facecolor = 'None', edgecolor='k')
if C in Ind:
_ = ax.set(xlabel = 'Aisles', ylabel='Percentage', ylim = [0, 100])
else:
_ = ax.set(xlabel = 'Aisles', ylabel=None, ylim = [0, m])
_ = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
_ = ax.set_title(aisle)
C +=1
_ = fig.suptitle(t = 'Product Distributions', y = .91,
fontproperties=font, fontsize = 16)
Colors = myColors.copy()
LC = 'Black'
fig = px.bar(Products_by_Aisle, x = 'Aisle', y='Total Products', color = 'Department',
text = 'Total Products', color_discrete_sequence= Colors, hover_data= ['Department', 'Total Products'])
fig.update_layout(plot_bgcolor= 'white', barmode='stack')
fig.update_traces(marker_line_color= LC, marker_line_width=1, opacity=1)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
showgrid=True, gridwidth=1, gridcolor='Lightgray', range=[0, 1.4e3])
fig.update_layout(title={'text': 'Total Products in Each Aisles', 'x':0.5, 'y':0.94,
'xanchor': 'center', 'yanchor': 'top'})
fig.show()
Orders_Detailed = orders[['user_id', 'order_id']].merge(order_products_train[['order_id', 'product_id']],
how='inner', left_on='order_id', right_on='order_id')
Orders_Detailed.columns = [x.replace('_',' ').title().replace('Id','ID') for x in Orders_Detailed.columns.tolist()]
Orders_Detailed = Orders_Detailed.merge(Products_Detailed, how='inner', left_on='Product ID', right_on='Product ID')
#
Orders_by_Department = Orders_Detailed.groupby('Department')['Order ID'].count().\
to_frame('Total Orders').reset_index(drop = False)
Orders_by_Department['Percentage'] = np.round(100* Orders_by_Department['Total Orders']\
/ Orders_by_Department['Total Orders'].sum(),2)
Colors = ['MediumVioletRed']
LC = 'Indigo'
fig = px.bar(Orders_by_Department, x = 'Department', y= 'Percentage', text = 'Total Orders', color_discrete_sequence= Colors,
hover_data = Orders_by_Department.columns)
fig.update_layout(plot_bgcolor= 'white', barmode='stack')
fig.update_traces(marker_line_color= LC, marker_line_width=1, opacity=1)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
showgrid=True, gridwidth=1, gridcolor='Lightgray', range=[0, 30])
fig.update_layout(title={'text': 'Best Selling Departments', 'x':0.5, 'y':0.94,
'xanchor': 'center', 'yanchor': 'top'})
fig.show()
Products_by_Aisle = Orders_Detailed.groupby(["Department", "Aisle"])["Product ID"].count().\
to_frame('Total Products').reset_index(drop = False)
Products_by_Aisle['Percentage'] = np.round(100* Products_by_Aisle['Total Products']\
/ Products_by_Aisle['Total Products'].sum(),2)
Products_by_Aisle = Products_by_Aisle.sort_values(by='Total Products', ascending=False)
Products_by_Aisle.head().style.hide_index()
Products_by_Aisle = Products_by_Aisle.loc[(Products_by_Aisle['Department'] != 'Missing') &
(Products_by_Aisle['Aisle'] != 'Missing') &
(Products_by_Aisle['Aisle'] != 'Other')]
for d in Products_by_Aisle['Department'].unique().tolist():
Products_by_Aisle.loc[Products_by_Aisle['Department'] == d , 'Percentage'] = \
np.round(100* Products_by_Aisle.loc[Products_by_Aisle['Department'] == d , 'Total Products']\
/Products_by_Aisle.loc[Products_by_Aisle['Department'] == d , 'Total Products'].sum(),2)
# Plot
font = FontProperties()
font.set_weight('bold')
Temp = list(np.linspace(0, 100, 6))
m = Products_by_Aisle['Percentage'].max()
for j in range(len(Temp)-1):
if Temp[j] <= m < Temp[j+1]:
m = Temp[j+1]
break
N = len(Products_by_Aisle.Department.unique())
fpr = 4; C = 0;
Ind = list(np.arange(0, N, fpr))
fig, ax = plt.subplots(nrows=int(np.ceil(N/fpr)), ncols=fpr, figsize=(16,30), sharey = True,
gridspec_kw = dict(hspace=1.4, wspace=.0))
ax = ax.ravel()
_ = fig.delaxes(ax[-1])
Palette = ['Purples', 'Blues', 'Greens', 'Oranges']*5
EdgeColor = ['Indigo', 'Navy', 'ForestGreen','Salmon']*5
Hatch = ['//', 'O', '--', '\\']*5
for (aisle, PBA), ax in zip(Products_by_Aisle.groupby(['Department']), ax.flatten()):
_ = sns.barplot(PBA['Aisle'], PBA['Percentage'] , ax=ax, palette= Palette[C], edgecolor=EdgeColor[C], hatch= Hatch[C])
_ = sns.barplot(PBA['Aisle'], PBA['Percentage'] , ax=ax, facecolor = 'None', edgecolor='k')
if C in Ind:
_ = ax.set(xlabel = 'Aisles', ylabel='Percentage', ylim = [0, 100])
else:
_ = ax.set(xlabel = 'Aisles', ylabel=None, ylim = [0, m])
_ = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
_ = ax.set_title(aisle)
C +=1
_ = fig.suptitle(t = 'Product Sale Distributions', y = .91,
fontproperties=font, fontsize = 16)
##
Colors = myColors.copy()
LC = 'Black'
fig = px.bar(Products_by_Aisle, x = 'Aisle', y='Total Products', color = 'Department',
text = 'Total Products', color_discrete_sequence= Colors,
hover_data= ['Department', 'Total Products'])
fig.update_layout(plot_bgcolor= 'white', barmode='stack')
fig.update_traces(marker_line_color= LC, marker_line_width=1, opacity=1)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
showgrid=True, gridwidth=1, gridcolor='Lightgray', range=[0, 2e5])
fig.update_layout(title={'text': 'Best Selling Aisles', 'x':0.5, 'y':0.94,
'xanchor': 'center', 'yanchor': 'top'})
fig.show()